Machine Learning Algorithms

INDEX

1.Linear regression for Prediction of Suicide count from 2013-2033

2.With respect to Causes,Year

3.State,Year

4.Age grp,Year

5.Wrt State,Age grp,Year Culminated Model

0)Libraries,Extraction , cleaning and separating:

df<-read.table('Suicides in India 2001-2012.csv',header=T, sep=",")
#head(df)

#install.packages('tidyverse')
#install.packages('caret')
#install.packages("plotly")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'stringr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 4.1.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(dplyr)
library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.1
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.1.1
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-2
#Data Cleaning
#Replacing Values for UT
df["State"][df["State"] == "A & N Islands"]<-"A & N Islands-Ut"
df["State"][df["State"] == "Chandigarh"]<-"Chandigarh-Ut"
df["State"][df["State"] == "D & N Haveli"]<-"D & N Haveli-Ut"
df["State"][df["State"] == "Daman & Diu"]<-"Daman & Diu-Ut"
df["State"][df["State"] == "Lakshadweep"]<-"Lakshadweep-Ut"
df["State"][df["State"] == "Delhi"] <-"Delhi-Ut"
#head(df)
#Renaming causes
df["Type"][df["Type"]=="Bankruptcy or Sudden change in Economic"]<-"Sudden change in Economic Status or Bankruptcy"
df["Type"][df["Type"]=="By Other means (please specify)"]<-"By Other means"
df["Type"][df["Type"]=="Not having Children(Barrenness/Impotency"]<-"Not having Children(Impotency)"
df["Type"][df["Type"]=="By Jumping from (Building)"]<-"By Jumping from Building"
df["Type"][df["Type"]=="Hr. Secondary/Intermediate/Pre-Universit"]<-"Hr. Secondary/Intermediate/Pre-University"
df["Type"][df["Type"]=="Failure in Examination"]<-"Examination Failure"
df["Type"][df["Type"]=="By coming under running vehicles/trains"]<-"By road or railway accidents" 
df["Type"][df["Type"]=="Bankruptcy or Sudden change in Economic Status"]<-"Sudden change in Economic Status or Bankruptcy"
df["Type"][df["Type"]=="Not having Children (Barrenness/Impotency"]<-"Not having Children(Impotency)"
#causescount
#head(df)

#drop the unwanted State-titles
df1 <- df[!(df$State=="Total (Uts)" | df$State=="Total (All India)" |  df$State=="Total (States)"),]
#drop the values ==0 under Total
df2 <- df1[!(df1$Total==0),]
# drop the unwanted Types
df2 <- df2[!(df2$Type=="By Other means" | df2$Type=="Others (Please Specify)" | df2$Type=="Causes Not known" |  df2$Type=="Other Causes (Please Specity)"),]

#Spliting the dataframe into smaller dataframe based on the column "Type_code"
causesdf=filter(df2,df2$Type_code=="Causes")
edudf=filter(df2,df2$Type_code=="Education_Status")
meansdf=filter(df2,df2$Type_code=="Means_adopted")
professionaldf=filter(df2,df2$Type_code=="Professional_Profile")
socialdf=filter(df2,df2$Type_code=="Social_Status")

1.1)Linear Regression for prediction for suicide count from 2013-2033

# Extracting the needed year and suicide count columns
suicide_count_overyears <- df2 %>% group_by(Year) %>% summarise(total_case=sum(Total))
suicide_count_overyears
## # A tibble: 12 x 2
##     Year total_case
##    <int>      <int>
##  1  2001     467928
##  2  2002     476738
##  3  2003     482322
##  4  2004     486323
##  5  2005     486115
##  6  2006     512676
##  7  2007     522233
##  8  2008     531216
##  9  2009     539470
## 10  2010     564083
## 11  2011     564376
## 12  2012     547894
#Testing co relation
cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
## [1] 0.962955
cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)
## 
##  Pearson's product-moment correlation
## 
## data:  suicide_count_overyears$Year and suicide_count_overyears$total_case
## t = 11.292, df = 10, p-value = 5.163e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8696678 0.9898336
## sample estimates:
##      cor 
## 0.962955
#Partitioning into train and test
set.seed(123)
train_samples <- suicide_count_overyears$Year %>%
  createDataPartition(p=0.65,list=FALSE)
#train_samples
head(train_samples)
##      Resample1
## [1,]         1
## [2,]         3
## [3,]         5
## [4,]         6
## [5,]         8
## [6,]         9
train <- suicide_count_overyears[train_samples,]
test <- suicide_count_overyears[-train_samples,]
#train
#test
#suicide_count_overyears

# LR Model creation
model <- lm(total_case~Year,data=train)
summary(model)
## 
## Call:
## lm(formula = total_case ~ Year, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13947  -3539   3390   4469  11714 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17077042    1946684  -8.772 0.000122 ***
## Year             8767        970   9.038 0.000103 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9839 on 6 degrees of freedom
## Multiple R-squared:  0.9316, Adjusted R-squared:  0.9202 
## F-statistic: 81.68 on 1 and 6 DF,  p-value: 0.0001028
#Make predictions
pred <- model %>%
  predict(test)

pred
##        1        2        3        4 
## 473762.3 491295.5 517595.5 543895.4
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$total_case)
RMSE
## [1] 10754.4
R2 <- R2(pred,test$total_case)
R2
## [1] 0.9753916
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
future_years=data.frame(Year=c(2013:2033))
future_years$total_case <- model %>%
  predict(future_years)

future_years
##    Year total_case
## 1  2013   570195.3
## 2  2014   578961.9
## 3  2015   587728.5
## 4  2016   596495.2
## 5  2017   605261.8
## 6  2018   614028.4
## 7  2019   622795.1
## 8  2020   631561.7
## 9  2021   640328.4
## 10 2022   649095.0
## 11 2023   657861.6
## 12 2024   666628.3
## 13 2025   675394.9
## 14 2026   684161.5
## 15 2027   692928.2
## 16 2028   701694.8
## 17 2029   710461.4
## 18 2030   719228.1
## 19 2031   727994.7
## 20 2032   736761.3
## 21 2033   745528.0
# Bar plot for rise in suicide cases
total_suicide_count <- rbind(suicide_count_overyears,future_years)
#total
fig <- plot_ly(
  x = total_suicide_count$Year,
  y = total_suicide_count$total_case,
  name = "Variation of suicide count over the years 2013-2032",
  type = "bar",

)
fig <- fig %>% layout(title = "Suicide Trend Over the years",
                      barmode = 'group',
                      xaxis = list(title = "Years"),
                      yaxis = list(title = "Count"))

fig

1.2)Logistic Regression for prediction for suicide count from 2013-2033

# Extracting the needed year and suicide count columns
suicide_count_overyears <- df2 %>% group_by(Year) %>% summarise(total_case=sum(Total))
suicide_count_overyears
## # A tibble: 12 x 2
##     Year total_case
##    <int>      <int>
##  1  2001     467928
##  2  2002     476738
##  3  2003     482322
##  4  2004     486323
##  5  2005     486115
##  6  2006     512676
##  7  2007     522233
##  8  2008     531216
##  9  2009     539470
## 10  2010     564083
## 11  2011     564376
## 12  2012     547894
#Testing co relation
cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
## [1] 0.962955
cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)
## 
##  Pearson's product-moment correlation
## 
## data:  suicide_count_overyears$Year and suicide_count_overyears$total_case
## t = 11.292, df = 10, p-value = 5.163e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8696678 0.9898336
## sample estimates:
##      cor 
## 0.962955
#Partitioning into train and test
set.seed(123)
train_samples <- suicide_count_overyears$Year %>%
  createDataPartition(p=0.65,list=FALSE)
#train_samples
head(train_samples)
##      Resample1
## [1,]         1
## [2,]         3
## [3,]         5
## [4,]         6
## [5,]         8
## [6,]         9
train <- suicide_count_overyears[train_samples,]
test <- suicide_count_overyears[-train_samples,]
train
## # A tibble: 8 x 2
##    Year total_case
##   <int>      <int>
## 1  2001     467928
## 2  2003     482322
## 3  2005     486115
## 4  2006     512676
## 5  2008     531216
## 6  2009     539470
## 7  2011     564376
## 8  2012     547894
test
## # A tibble: 4 x 2
##    Year total_case
##   <int>      <int>
## 1  2002     476738
## 2  2004     486323
## 3  2007     522233
## 4  2010     564083
as.factor(suicide_count_overyears$Total)
## Warning: Unknown or uninitialised column: `Total`.
## factor(0)
## Levels:
# Training model
logistic_modelLR <- glm(total_case~Year, data = train)
logistic_modelLR
## 
## Call:  glm(formula = total_case ~ Year, data = train)
## 
## Coefficients:
## (Intercept)         Year  
##   -17077042         8767  
## 
## Degrees of Freedom: 7 Total (i.e. Null);  6 Residual
## Null Deviance:       8.487e+09 
## Residual Deviance: 580800000     AIC: 173.5
pred <- logistic_modelLR %>%
  predict(test)
pred
##        1        2        3        4 
## 473762.3 491295.5 517595.5 543895.4
RMSE <- RMSE(pred,test$total_case)
RMSE
## [1] 10754.4
R2 <- R2(pred,test$total_case)
R2
## [1] 0.9753916
test
## # A tibble: 4 x 2
##    Year total_case
##   <int>      <int>
## 1  2002     476738
## 2  2004     486323
## 3  2007     522233
## 4  2010     564083

2)CAUSES Filtration

topcauses<-causesdf%>%select(Type,Year,Total) %>% group_by(Type)%>% 
  summarise(Total=sum(Total)) %>% arrange(desc(Total))%>% head(10)
topcauses<-as.data.frame(topcauses)
head(topcauses)
##                                             Type  Total
## 1                                Family Problems 341952
## 2                        Other Prolonged Illness 194565
## 3                        Insanity/Mental Illness  94229
## 4                                   Love Affairs  45039
## 5 Sudden change in Economic Status or Bankruptcy  35410
## 6                                        Poverty  32684
topcauses1<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Family Problems")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses1)
## # A tibble: 6 x 3
## # Groups:   Year [6]
##    Year Type            Total
##   <int> <chr>           <int>
## 1  2001 Family Problems 74067
## 2  2002 Family Problems 75891
## 3  2003 Family Problems 78419
## 4  2004 Family Problems 78690
## 5  2005 Family Problems 77022
## 6  2006 Family Problems 85675
topcauses2<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Other Prolonged Illness")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses2)
## # A tibble: 6 x 3
## # Groups:   Year [6]
##    Year Type                    Total
##   <int> <chr>                   <int>
## 1  2001 Other Prolonged Illness 74067
## 2  2002 Other Prolonged Illness 75891
## 3  2003 Other Prolonged Illness 78419
## 4  2004 Other Prolonged Illness 78690
## 5  2005 Other Prolonged Illness 77022
## 6  2006 Other Prolonged Illness 85675
topcauses3<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Insanity/Mental Illness")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses3)
## # A tibble: 6 x 3
## # Groups:   Year [6]
##    Year Type                    Total
##   <int> <chr>                   <int>
## 1  2001 Insanity/Mental Illness 74067
## 2  2002 Insanity/Mental Illness 75891
## 3  2003 Insanity/Mental Illness 78419
## 4  2004 Insanity/Mental Illness 78690
## 5  2005 Insanity/Mental Illness 77022
## 6  2006 Insanity/Mental Illness 85675
topcauses4<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Love Affairs")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses4)
## # A tibble: 6 x 3
## # Groups:   Year [6]
##    Year Type         Total
##   <int> <chr>        <int>
## 1  2001 Love Affairs 74067
## 2  2002 Love Affairs 75891
## 3  2003 Love Affairs 78419
## 4  2004 Love Affairs 78690
## 5  2005 Love Affairs 77022
## 6  2006 Love Affairs 85675
topcauses5<-causesdf%>%select(Type,Year,Total) %>% group_by(Year,Type="Sudden change in Economic Status or Bankruptcy")%>%summarise(Total=sum(Total)) %>% arrange(desc(Type))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
head(topcauses5)
## # A tibble: 6 x 3
## # Groups:   Year [6]
##    Year Type                                           Total
##   <int> <chr>                                          <int>
## 1  2001 Sudden change in Economic Status or Bankruptcy 74067
## 2  2002 Sudden change in Economic Status or Bankruptcy 75891
## 3  2003 Sudden change in Economic Status or Bankruptcy 78419
## 4  2004 Sudden change in Economic Status or Bankruptcy 78690
## 5  2005 Sudden change in Economic Status or Bankruptcy 77022
## 6  2006 Sudden change in Economic Status or Bankruptcy 85675
top5causes=rbind(topcauses1,topcauses2,topcauses3,topcauses4,topcauses5)

2.1)CAUSES MLR

set.seed(123)
train_samples <- top5causes$Year %>%
  createDataPartition(p=0.70,list=FALSE)

train <- top5causes[train_samples,]
test <- top5causes[-train_samples,]


# LR Model creation
model <- lm(Total~Year+Type,data=train)
summary(model)
## 
## Call:
## lm(formula = Total ~ Year + Type, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7283.0 -2113.2   360.6  2606.4  4655.2 
## 
## Coefficients:
##                                                      Estimate Std. Error
## (Intercept)                                        -2101988.0   308256.4
## Year                                                   1088.5      153.7
## TypeInsanity/Mental Illness                             745.2     1718.4
## TypeLove Affairs                                        367.2     1542.1
## TypeOther Prolonged Illness                            -710.7     1636.9
## TypeSudden change in Economic Status or Bankruptcy     -511.1     1638.1
##                                                    t value Pr(>|t|)    
## (Intercept)                                         -6.819 4.33e-08 ***
## Year                                                 7.084 1.89e-08 ***
## TypeInsanity/Mental Illness                          0.434    0.667    
## TypeLove Affairs                                     0.238    0.813    
## TypeOther Prolonged Illness                         -0.434    0.667    
## TypeSudden change in Economic Status or Bankruptcy  -0.312    0.757    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3523 on 38 degrees of freedom
## Multiple R-squared:  0.5854, Adjusted R-squared:  0.5309 
## F-statistic: 10.73 on 5 and 38 DF,  p-value: 1.754e-06
#Make predictions
pred <- model %>%
  predict(test)

pred
##        1        2        3        4        5        6        7        8 
## 86973.33 78643.20 80820.19 84085.67 85174.16 77922.11 79010.60 80099.09 
##        9       10       11       12       13       14       15       16 
## 81187.59 87718.55 77544.08 85163.54 75577.28 82108.25 83196.74 85373.73
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2926.94
R2 <- R2(pred,test$Total)
R2
## [1] 0.8575155
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
new.speeds <- data.frame(
  Year = c(2023, 2024, 2025) , Type = c("Family Problems","Love Affairs","Other Prolonged Illness") 
)
#(agedf)
predict(model, newdata = new.speeds)
##        1        2        3 
## 100035.3 101490.9 101501.6
future_years
##    Year total_case
## 1  2013   570195.3
## 2  2014   578961.9
## 3  2015   587728.5
## 4  2016   596495.2
## 5  2017   605261.8
## 6  2018   614028.4
## 7  2019   622795.1
## 8  2020   631561.7
## 9  2021   640328.4
## 10 2022   649095.0
## 11 2023   657861.6
## 12 2024   666628.3
## 13 2025   675394.9
## 14 2026   684161.5
## 15 2027   692928.2
## 16 2028   701694.8
## 17 2029   710461.4
## 18 2030   719228.1
## 19 2031   727994.7
## 20 2032   736761.3
## 21 2033   745528.0

2.2)Causes Lasso

#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)

#Partitioning into train and test
set.seed(123)
train_samples <- top5causes$Total %>%
  createDataPartition(p=0.80,list=FALSE)

train <- top5causes[train_samples,]
test <- top5causes[-train_samples,]
#agedf

#install.packages("glmnet")


#perform k-fold cross-validation to find optimal lambda value
cv_model <- cv.glmnet(data.matrix(train[, c('Year','Type')]), train$Total, alpha = 0.5)
cv_model
## 
## Call:  cv.glmnet(x = data.matrix(train[, c("Year", "Type")]), y = train$Total,      alpha = 0.5) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index  Measure      SE Nonzero
## min  546.3    30 11625269 1598685       1
## 1se 1668.4    18 12943740 1230864       1
#find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 546.3268
#[1] best_lambda=546.3268

#produce plot of test MSE by lambda value
plot(cv_model)

#Best Lasso model
#t=data.matrix(train[, c('Year','State','Age_group')])
#t
best_model <- glmnet(data.matrix(train[, c('Year','Type')]), train$Total, alpha = 0.5, lambda = best_lambda)
coef(best_model)
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept) -1973496.614
## Year            1024.534
## Type               .
#Prediction

#define new observation
#new = matrix(c(2015,'Maharashtra',"45-59"), nrow=1, ncol=3) 
#data.matrix(c(2015,'Maharashtra',"45-59"))
#new
#use lasso regression model to predict response value
#predict(best_model, s = best_lambda, newx = new)

x=data.matrix(test[, c('Year','Type')])
#x
y=test$Total
#Metrics
y_predicted <- predict(best_model, s = best_lambda, newx = x)


RMSE <- RMSE(y,y_predicted)
RMSE
## [1] 2539.525
R2 <- R2(y,y_predicted)
R2
##         [,1]
## s1 0.9194946
#find SST and SSE
#sst <- sum((y - mean(y))^2)
#sse <- sum((y_predicted - y)^2)

#find R-Squared
#rsq <- 1-sse/sst
#rsq

2.3)SVM for Causes

library(e1071)
## Warning: package 'e1071' was built under R version 4.1.1
set.seed(123)

train_samples <- top5causes$Total %>% createDataPartition(p=0.65,list=FALSE)
train <- top5causes[train_samples,]
train$Type<-as.factor(train$Type)
test <- top5causes[-train_samples,]
test$Type<-as.factor(test$Type)
#agedf

# MLR Model creation
causessvm <- svm(Total~Year+Type,data=train, kernel = 'linear')
summary(causessvm)
## 
## Call:
## svm(formula = Total ~ Year + Type, data = train, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.1666667 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  38
#Make predictions
pred <- causessvm %>%predict(test)
pred
##        1        2        3        4        5        6        7        8 
## 74980.24 79305.54 83630.83 85072.60 77481.51 83248.57 84690.34 89015.64 
##        9       10       11       12       13       14       15       16 
## 90457.40 76419.51 77861.28 79303.05 80744.81 87953.64 76338.50 83547.33 
##       17       18       19       20 
## 74977.75 82186.58 86511.88 87953.64
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2570.844
R2 <- R2(pred,test$Total)
R2
## [1] 0.7645348

2.4)Logistic Regression for Causes

#Partitioning into train and test
set.seed(123)
train_samples <- top5causes$Total %>%
  createDataPartition(p=0.80,list=FALSE)

train <- top5causes[train_samples,]
test <- top5causes[-train_samples,]


as.factor(top5causes$Total)
##  [1] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## [13] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## [25] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## [37] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## [49] 74067 75891 78419 78690 77022 85675 84575 86225 85364 90476 89927 81524
## 12 Levels: 74067 75891 77022 78419 78690 81524 84575 85364 85675 ... 90476
# Training model
logistic_modelC <- glm(Total~Year+Type, data = train)
logistic_modelC
## 
## Call:  glm(formula = Total ~ Year + Type, data = train)
## 
## Coefficients:
##                                        (Intercept)  
##                                         -2.237e+06  
##                                               Year  
##                                          1.156e+03  
##                        TypeInsanity/Mental Illness  
##                                         -6.572e+01  
##                                   TypeLove Affairs  
##                                         -1.386e+01  
##                        TypeOther Prolonged Illness  
##                                         -2.740e+02  
## TypeSudden change in Economic Status or Bankruptcy  
##                                         -2.213e+02  
## 
## Degrees of Freedom: 47 Total (i.e. Null);  42 Residual
## Null Deviance:       1.306e+09 
## Residual Deviance: 515300000     AIC: 927.3
pred <- logistic_modelC %>%
  predict(test)
pred
##        1        2        3        4        5        6        7        8 
## 78000.87 83781.38 84937.48 77053.07 78209.17 79365.28 80521.38 82833.58 
##        9       10       11       12 
## 86301.89 85197.64 75741.42 86146.34
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2281.396
R2 <- R2(pred,test$Total)
R2
## [1] 0.9157739
test
## # A tibble: 12 x 3
## # Groups:   Year [9]
##     Year Type                                           Total
##    <int> <chr>                                          <int>
##  1  2003 Other Prolonged Illness                        78419
##  2  2008 Other Prolonged Illness                        86225
##  3  2009 Other Prolonged Illness                        85364
##  4  2002 Insanity/Mental Illness                        75891
##  5  2003 Insanity/Mental Illness                        78419
##  6  2004 Insanity/Mental Illness                        78690
##  7  2005 Insanity/Mental Illness                        77022
##  8  2007 Insanity/Mental Illness                        84575
##  9  2010 Insanity/Mental Illness                        90476
## 10  2009 Love Affairs                                   85364
## 11  2001 Sudden change in Economic Status or Bankruptcy 74067
## 12  2010 Sudden change in Economic Status or Bankruptcy 90476

3)Top 3 states filtration

topstate<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%select(State,Year,Total) %>% group_by(State)%>% 
  summarise(Total=sum(Total)) %>% arrange(desc(Total))%>% head(10)
topstate<-as.data.frame(topstate)
topstate
##             State  Total
## 1     Maharashtra 855611
## 2     West Bengal 709969
## 3  Andhra Pradesh 703486
## 4      Tamil Nadu 696957
## 5       Karnataka 594641
## 6          Kerala 472724
## 7  Madhya Pradesh 391781
## 8         Gujarat 296395
## 9    Chhattisgarh 236825
## 10         Odisha 206601
topstate1<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
  select(State,Year,Total) %>% group_by(Year,State="Maharashtra")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
topstate2<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
  select(State,Year,Total) %>% group_by(Year,State="West Bengal")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
topstate3<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
  select(State,Year,Total) %>% group_by(Year,State="Andhra Pradesh")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
top3state=rbind(topstate1,topstate2,topstate3)
#View(top3state)

3.1)TOP 3 STATES MLR

# Extracting the needed year and suicide count columns

#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)

#Partitioning into train and test
set.seed(123)
train_samples <- top3state$Year %>% createDataPartition(p=0.80,list=FALSE)
train <- top3state[train_samples,]
test <- top3state[-train_samples,]


# LR Model creation
model <- lm(Total~Year+State,data=train)
summary(model)
## 
## Call:
## lm(formula = Total ~ Year + State, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18679  -2843   1619   3272  17744 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.781e+07  1.002e+06 -17.781   <2e-16 ***
## Year              9.133e+03  4.993e+02  18.293   <2e-16 ***
## StateMaharashtra -5.237e+02  4.081e+03  -0.128    0.899    
## StateWest Bengal  1.444e+03  4.200e+03   0.344    0.734    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9565 on 28 degrees of freedom
## Multiple R-squared:  0.9247, Adjusted R-squared:  0.9166 
## F-statistic: 114.5 on 3 and 28 DF,  p-value: 7.941e-16
#Make predictions
pred <- model %>%
  predict(test)

pred
##        1        2        3        4 
## 555472.2 466106.9 502639.9 528596.1
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 9520.142
R2 <- R2(pred,test$Total)
R2
## [1] 0.947362
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
new.speeds <- data.frame(
  Year = c(2013, 2014, 2025) , State = c("Maharashtra","West Bengal","Maharashtra"))
#(agedf)
predict(model, newdata = new.speeds)
##        1        2        3 
## 573738.7 584839.2 683337.7
#future_years=data.frame(Year=c(2013:2033))
#future_years$total_case <- model %>%
#  predict(future_years)

3.2)Lasso for top 3 states

#install.packages("glmnet")
library(glmnet)

#perform k-fold cross-validation to find optimal lambda value
cv_model <- cv.glmnet(data.matrix(top3state[, c('Year','State')]), top3state$Total, alpha = 1)
cv_model
## 
## Call:  cv.glmnet(x = data.matrix(top3state[, c("Year", "State")]), y = top3state$Total,      alpha = 1) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index   Measure       SE Nonzero
## min   1124    37  97255652 23366319       1
## 1se   4536    22 116736759 29342687       1
#find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 1123.63
#[1] best_lambda=1123.63

#produce plot of test MSE by lambda value
plot(cv_model)

#Best Lasso model
best_model <- glmnet(data.matrix(top3state[, c('Year','State')]), top3state$Total, alpha = 1, lambda = best_lambda)
coef(best_model)
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept) -17432523.740
## Year             8944.749
## State               .
#Prediction

#define new observation
#new = matrix(c(2005,'Maharashtra'), nrow=1, ncol=2) 
#new
#use lasso regression model to predict response value
#predict(best_model, s = best_lambda, newx = new)

x=data.matrix(top3state[, c('Year','State')])
y=top3state$Total
#Metrics
y_predicted <- predict(best_model, s = best_lambda, newx = x)

#find SST and SSE
sst <- sum((y - mean(y))^2)
sse <- sum((y_predicted - y)^2)

#find R-Squared
rsq <- 1 - sse/sst
rsq
## [1] 0.9261391

3.3)SVM for top 3 states

top3state
## # A tibble: 36 x 3
## # Groups:   Year [12]
##     Year State        Total
##    <int> <chr>        <int>
##  1  2001 Maharashtra 467928
##  2  2002 Maharashtra 476738
##  3  2003 Maharashtra 482322
##  4  2004 Maharashtra 486323
##  5  2005 Maharashtra 486115
##  6  2006 Maharashtra 512676
##  7  2007 Maharashtra 522233
##  8  2008 Maharashtra 531216
##  9  2009 Maharashtra 539470
## 10  2010 Maharashtra 564083
## # ... with 26 more rows
library(e1071)
set.seed(123)

train_samples <- top3state$Total %>% createDataPartition(p=0.80,list=FALSE)
train <- top3state[train_samples,]
test <- top3state[-train_samples,]

#agedf

# MLR Model creation
statemodelsvm <- svm(Total~State,data=train, kernel = 'linear')
summary(statemodelsvm)
## 
## Call:
## svm(formula = Total ~ State, data = train, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.3333333 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  32
#Make predictions
pred <- statemodelsvm %>%predict(test)
pred
##      1      2      3      4 
## 517453 517456 517456 517453
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 38165.66
R2 <- R2(pred,test$Total)
R2
## [1] 0.8750611

3.4)Logistic Regression for top 3 states

# Splitting dataset
set.seed(123)
train_samples <- top3state$Year %>% createDataPartition(p=0.65,list=FALSE)
#train_samples
head(train_samples)
##      Resample1
## [1,]         2
## [2,]         3
## [3,]         4
## [4,]         5
## [5,]         7
## [6,]         9
train <- top3state[train_samples,]
test <- top3state[-train_samples,]
train
## # A tibble: 24 x 3
## # Groups:   Year [11]
##     Year State        Total
##    <int> <chr>        <int>
##  1  2002 Maharashtra 476738
##  2  2003 Maharashtra 482322
##  3  2004 Maharashtra 486323
##  4  2005 Maharashtra 486115
##  5  2007 Maharashtra 522233
##  6  2009 Maharashtra 539470
##  7  2010 Maharashtra 564083
##  8  2011 Maharashtra 564376
##  9  2012 Maharashtra 547894
## 10  2002 West Bengal 476738
## # ... with 14 more rows
test
## # A tibble: 12 x 3
## # Groups:   Year [8]
##     Year State           Total
##    <int> <chr>           <int>
##  1  2001 Maharashtra    467928
##  2  2006 Maharashtra    512676
##  3  2008 Maharashtra    531216
##  4  2001 West Bengal    467928
##  5  2010 West Bengal    564083
##  6  2001 Andhra Pradesh 467928
##  7  2005 Andhra Pradesh 486115
##  8  2006 Andhra Pradesh 512676
##  9  2007 Andhra Pradesh 522233
## 10  2009 Andhra Pradesh 539470
## 11  2010 Andhra Pradesh 564083
## 12  2011 Andhra Pradesh 564376
as.factor(top3state$Total)
##  [1] 467928 476738 482322 486323 486115 512676 522233 531216 539470 564083
## [11] 564376 547894 467928 476738 482322 486323 486115 512676 522233 531216
## [21] 539470 564083 564376 547894 467928 476738 482322 486323 486115 512676
## [31] 522233 531216 539470 564083 564376 547894
## 12 Levels: 467928 476738 482322 486115 486323 512676 522233 531216 ... 564376
# Training model
logistic_model <- glm(Total~Year+State, data = train)
logistic_model 
## 
## Call:  glm(formula = Total ~ Year + State, data = train)
## 
## Coefficients:
##      (Intercept)              Year  StateMaharashtra  StateWest Bengal  
##        -17222074              8838              3335              2084  
## 
## Degrees of Freedom: 23 Total (i.e. Null);  20 Residual
## Null Deviance:       2.408e+10 
## Residual Deviance: 1.825e+09     AIC: 513.6
pred <- logistic_model %>%
  predict(test)
pred
##        1        2        3        4        5        6        7        8 
## 465812.2 510001.5 527677.2 464560.5 544101.2 462476.9 497828.3 506666.2 
##        9       10       11       12 
## 515504.0 533179.7 542017.6 550855.5
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 10771.61
R2 <- R2(pred,test$Total)
R2
## [1] 0.9635767
test
## # A tibble: 12 x 3
## # Groups:   Year [8]
##     Year State           Total
##    <int> <chr>           <int>
##  1  2001 Maharashtra    467928
##  2  2006 Maharashtra    512676
##  3  2008 Maharashtra    531216
##  4  2001 West Bengal    467928
##  5  2010 West Bengal    564083
##  6  2001 Andhra Pradesh 467928
##  7  2005 Andhra Pradesh 486115
##  8  2006 Andhra Pradesh 512676
##  9  2007 Andhra Pradesh 522233
## 10  2009 Andhra Pradesh 539470
## 11  2010 Andhra Pradesh 564083
## 12  2011 Andhra Pradesh 564376

4)WRT AGE GRP Filtration

agedf<-df2%>% select(Year,Age_group,Total)%>% 
  filter(!Age_group=="0-100")%>% 
  filter(!Age_group=="0-100+")%>%
  group_by(Year,Age_group)%>% 
  summarise(Total=sum(Total))%>% arrange(desc(Age_group))
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
agedf<-as.data.frame(agedf)

4.1)AGE GROUP MLR

# Extracting the needed year and suicide count columns

#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)

#Partitioning into train and test
set.seed(123)
train_samples <- agedf$Year %>% createDataPartition(p=0.65,list=FALSE)
train <- agedf[train_samples,]
test <- agedf[-train_samples,]


# LR Model creation
model <- lm(Total~Year+Age_group,data=train)
summary(model)
## 
## Call:
## lm(formula = Total ~ Year + Age_group, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5598.3 -1889.8   124.6  1879.7  6777.5 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -1431382.0   279654.4  -5.118 1.21e-05 ***
## Year                715.9      139.4   5.136 1.14e-05 ***
## Age_group15-29    94742.6     1417.1  66.856  < 2e-16 ***
## Age_group30-44    89052.6     1554.9  57.273  < 2e-16 ***
## Age_group45-59    49284.1     1548.2  31.834  < 2e-16 ***
## Age_group60+      15361.8     1493.2  10.288 5.61e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2983 on 34 degrees of freedom
## Multiple R-squared:  0.9951, Adjusted R-squared:  0.9944 
## F-statistic:  1375 on 5 and 34 DF,  p-value: < 2.2e-16
#Make predictions
pred <- model %>%
  predict(test)

pred
##          1          5          8         10         16         18         20 
##  16516.515  19380.157  21527.888  22959.709  52586.557  54018.377  55450.198 
##         22         24         26         27         28         29         35 
##  56882.019  58313.840  90923.214  91639.124  92355.034  93070.945  97366.407 
##         38         44         49         55         57         59 
##  96613.230 100908.692   1154.720   5450.183   6882.004   8313.824
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2826.255
R2 <- R2(pred,test$Total)
R2
## [1] 0.9947472
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
new.speeds <- data.frame(
  Year = c(2013, 2013, 2013) , Age_group = c("30-44","45-59","0-14"))
#(agedf)
predict(model, newdata = new.speeds)
##         1         2         3 
## 98798.228 59029.750  9745.645
#future_years=data.frame(Year=c(2013:2033))
#future_years$total_case <- model %>%
#  predict(future_years)

4.2)SVM Model for Age wrt Total

library(e1071)
set.seed(123)

train_samples <- agedf$Total %>% createDataPartition(p=0.80,list=FALSE)
train <- agedf[train_samples,]
test <- agedf[-train_samples,]

#agedf

# MLR Model creation
modelsvm1 <- svm(Total~Year+Age_group,data=train, kernel = 'linear')
summary(modelsvm1)
## 
## Call:
## svm(formula = Total ~ Year + Age_group, data = train, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.1666667 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  11
#Make predictions
pred <- modelsvm1 %>%predict(test)
pred
##          7         10         11         24         26         33         36 
##  22320.441  24496.043  25221.244  58521.288  89093.369  94169.773  96345.375 
##         40         47         53         54         58 
##  96157.551 101233.955   4757.775   5482.976   8383.778
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 3231.497
R2 <- R2(pred,test$Total)
R2
## [1] 0.9948775
#find SST and SSE
sst <- sum((y - mean(y))^2)
sse <- sum((y_predicted - y)^2)

#find R-Squared
rsq <- 1-sse/sst
rsq
## [1] 0.9261391

4.3)Lasso for age grp

#install.packages("glmnet")
library(glmnet)

#Partitioning into train and test
set.seed(123)
train_samples <- agedf$Total %>%
  createDataPartition(p=0.80,list=FALSE)
train <- agedf[train_samples,]
test <- agedf[-train_samples,]

#agedf

#install.packages("glmnet")
#library(glmnet)

#perform k-fold cross-validation to find optimal lambda value
cv_model <- cv.glmnet(data.matrix(train[, c('Year','Age_group')]), train$Total, alpha = 0.5)
cv_model
## 
## Call:  cv.glmnet(x = data.matrix(train[, c("Year", "Age_group")]), y = train$Total,      alpha = 0.5) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index   Measure        SE Nonzero
## min   6930     1 1.487e+09 161516174       0
## 1se   6930     1 1.487e+09 161516174       0
#find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 6929.659
#[1] best_lambda=2565.932

#produce plot of test MSE by lambda value
plot(cv_model)

#Best Lasso model
#t=data.matrix(train[, c('Year','State','Age_group')])
#t
best_model <- glmnet(data.matrix(train[, c('Year','Age_group')]), train$Total, alpha = 0.5, lambda = best_lambda)
coef(best_model)
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept) 5.521567e+04
## Year        4.214766e-13
## Age_group   .
#Prediction

#define new observation
#new = matrix(c(2015,'Maharashtra',"45-59"), nrow=1, ncol=3) 
#data.matrix(c(2015,'Maharashtra',"45-59"))
#new
#use lasso regression model to predict response value
#predict(best_model, s = best_lambda, newx = new)

x=data.matrix(test[, c('Year','Age_group')])
#x
y=test$Total
#Metrics
y_predicted <- predict(best_model, s = best_lambda, newx = x)


RMSE <- RMSE(y,y_predicted)
RMSE
## [1] 40523.15
R2 <- R2(y,y_predicted)
R2
##         [,1]
## s1 0.0566469
#find SST and SSE
#sst <- sum((y - mean(y))^2)
#sse <- sum((y_predicted - y)^2)

#find R-Squared
#rsq <- 1-sse/sst
#rsq

4.4)Logistic Regression for age grp

#Partitioning into train and test
set.seed(123)
train_samples <- agedf$Year %>% createDataPartition(p=0.65,list=FALSE)
train <- agedf[train_samples,]
test <- agedf[-train_samples,]
train
##    Year Age_group  Total
## 2  2002       60+  19502
## 3  2003       60+  20131
## 4  2004       60+  19608
## 6  2006       60+  20288
## 7  2007       60+  20443
## 9  2009       60+  21485
## 11 2011       60+  21457
## 12 2012       60+  22150
## 13 2001     45-59  48788
## 14 2002     45-59  50101
## 15 2003     45-59  51731
## 17 2005     45-59  52429
## 19 2007     45-59  56164
## 21 2009     45-59  58020
## 23 2011     45-59  58032
## 25 2001     30-44  84609
## 30 2006     30-44  95655
## 31 2007     30-44  95370
## 32 2008     30-44  98751
## 33 2009     30-44  98341
## 34 2010     30-44  98670
## 36 2012     30-44  92987
## 37 2001     15-29  93274
## 39 2003     15-29  95906
## 40 2004     15-29  95084
## 41 2005     15-29  94026
## 42 2006     15-29 101304
## 43 2007     15-29 100250
## 45 2009     15-29 102474
## 46 2010     15-29 109118
## 47 2011     15-29 108921
## 48 2012     15-29 100139
## 50 2002      0-14   5189
## 51 2003      0-14   4923
## 52 2004      0-14   5217
## 53 2005      0-14   4850
## 54 2006      0-14   4773
## 56 2008      0-14   4322
## 58 2010      0-14   5571
## 60 2012      0-14   4461
test
##    Year Age_group  Total
## 1  2001       60+  18613
## 5  2005       60+  20274
## 8  2008       60+  19781
## 10 2010       60+  22748
## 16 2004     45-59  51213
## 18 2006     45-59  54432
## 20 2008     45-59  55497
## 22 2010     45-59  58778
## 24 2012     45-59  57267
## 26 2002     30-44  86718
## 27 2003     30-44  87929
## 28 2004     30-44  87807
## 29 2005     30-44  86708
## 35 2011     30-44  99956
## 38 2002     15-29  94394
## 44 2008     15-29 102831
## 49 2001      0-14   5632
## 55 2007      0-14   4732
## 57 2009      0-14   4848
## 59 2011      0-14   4840
as.factor(agedf$Total)
##  [1] 18613  19502  20131  19608  20274  20288  20443  19781  21485  22748 
## [11] 21457  22150  48788  50101  51731  51213  52429  54432  56164  55497 
## [21] 58020  58778  58032  57267  84609  86718  87929  87807  86708  95655 
## [31] 95370  98751  98341  98670  99956  92987  93274  94394  95906  95084 
## [41] 94026  101304 100250 102831 102474 109118 108921 100139 5632   5189  
## [51] 4923   5217   4850   4773   4732   4322   4848   5571   4840   4461  
## 60 Levels: 4322 4461 4732 4773 4840 4848 4850 4923 5189 5217 5571 ... 109118
# Training model
logistic_modelAG <- glm(Total~Year+Age_group, data = train)
logistic_modelAG
## 
## Call:  glm(formula = Total ~ Year + Age_group, data = train)
## 
## Coefficients:
##    (Intercept)            Year  Age_group15-29  Age_group30-44  Age_group45-59  
##     -1431382.0           715.9         94742.6         89052.6         49284.1  
##   Age_group60+  
##        15361.8  
## 
## Degrees of Freedom: 39 Total (i.e. Null);  34 Residual
## Null Deviance:       6.146e+10 
## Residual Deviance: 302600000     AIC: 761.1
pred <- logistic_modelAG %>%
  predict(test)
pred
##          1          5          8         10         16         18         20 
##  16516.515  19380.157  21527.888  22959.709  52586.557  54018.377  55450.198 
##         22         24         26         27         28         29         35 
##  56882.019  58313.840  90923.214  91639.124  92355.034  93070.945  97366.407 
##         38         44         49         55         57         59 
##  96613.230 100908.692   1154.720   5450.183   6882.004   8313.824
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2826.255
R2 <- R2(pred,test$Total)
R2
## [1] 0.9947472
test
##    Year Age_group  Total
## 1  2001       60+  18613
## 5  2005       60+  20274
## 8  2008       60+  19781
## 10 2010       60+  22748
## 16 2004     45-59  51213
## 18 2006     45-59  54432
## 20 2008     45-59  55497
## 22 2010     45-59  58778
## 24 2012     45-59  57267
## 26 2002     30-44  86718
## 27 2003     30-44  87929
## 28 2004     30-44  87807
## 29 2005     30-44  86708
## 35 2011     30-44  99956
## 38 2002     15-29  94394
## 44 2008     15-29 102831
## 49 2001      0-14   5632
## 55 2007      0-14   4732
## 57 2009      0-14   4848
## 59 2011      0-14   4840

5)Wrt State,Age grp,Year Culminated Model Filtration

####
# Extracting the needed year and suicide count columns
topstate1<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
  select(State,Year,Age_group,Total) %>%
  filter(!Age_group=="0-100")%>% 
  filter(!Age_group=="0-100+")%>%
  group_by(Year,Age_group,State="Maharashtra")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year', 'Age_group'. You can override using the `.groups` argument.
topstate2<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
  select(State,Year,Age_group,Total) %>%
  filter(!Age_group=="0-100")%>% 
  filter(!Age_group=="0-100+")%>%
  group_by(Year,Age_group,State="West Bengal")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year', 'Age_group'. You can override using the `.groups` argument.
topstate3<-df2%>%filter(!State %in% c("Total (All India)","Total (States)","Total (Uts)"))%>%
  select(State,Year,Age_group,Total) %>%
  filter(!Age_group=="0-100")%>% 
  filter(!Age_group=="0-100+")%>%
  group_by(Year,Age_group,State="Andhra Pradesh")%>%
  summarise(Total=sum(Total)) %>% arrange(desc(State))
## `summarise()` has grouped output by 'Year', 'Age_group'. You can override using the `.groups` argument.
#topstate3
model=rbind(topstate1,topstate2,topstate3)
modellasso=rbind(topstate1,topstate2,topstate3)
modelsvm=rbind(topstate1,topstate2,topstate3)
modelLog=rbind(topstate1,topstate2,topstate3)

5.1)Wrt State,Age grp,Year Culminated Model Lasso

#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)

#Partitioning into train and test
set.seed(123)
train_samples <- modellasso$Total %>%
  createDataPartition(p=0.80,list=FALSE)
train <- modellasso[train_samples,]
test <- modellasso[-train_samples,]
#agedf

#install.packages("glmnet")
library(glmnet)

#perform k-fold cross-validation to find optimal lambda value
cv_model <- cv.glmnet(data.matrix(train[, c('Year','State','Age_group')]), train$Total, alpha = 0)
cv_model
## 
## Call:  cv.glmnet(x = data.matrix(train[, c("Year", "State", "Age_group")]),      y = train$Total, alpha = 0) 
## 
## Measure: Mean-Squared Error 
## 
##      Lambda Index   Measure       SE Nonzero
## min  154956    36 1.481e+09 78984660       3
## 1se 4021135     1 1.483e+09 71572128       3
#find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min


best_lambda
## [1] 154955.6
#[1] best_lambda=2565.932

#produce plot of test MSE by lambda value
plot(cv_model)

#Best Lasso model
#t=data.matrix(train[, c('Year','State','Age_group')])
#t
best_model <- glmnet(data.matrix(train[, c('Year','State','Age_group')]), train$Total, alpha = 0, lambda = best_lambda)
coef(best_model)
## 4 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept) -403899.5431
## Year            228.4241
## State           667.3442
## Age_group      -375.1431
#Prediction

#define new observation
#new = matrix(c(2015,'Maharashtra',"45-59"), nrow=1, ncol=3) 
#data.matrix(c(2015,'Maharashtra',"45-59"))
#new
#use lasso regression model to predict response value
#predict(best_model, s = best_lambda, newx = new)

x=data.matrix(test[, c('Year','State','Age_group')])
#x
y=test$Total
#Metrics
y_predicted <- predict(best_model, s = best_lambda, newx = x)

y_predicted
##             s1
##  [1,] 54136.65
##  [2,] 52636.08
##  [3,] 54218.36
##  [4,] 53468.07
##  [5,] 54821.93
##  [6,] 54381.77
##  [7,] 54006.63
##  [8,] 54838.62
##  [9,] 55295.47
## [10,] 54920.32
## [11,] 54804.00
## [12,] 54428.85
## [13,] 54053.71
## [14,] 53760.27
## [15,] 55489.27
## [16,] 55114.13
## [17,] 55946.12
## [18,] 54820.69
## [19,] 55049.11
## [20,] 56027.82
## [21,] 54902.39
## [22,] 55881.10
## [23,] 55359.24
## [24,] 57316.66
## [25,] 53322.59
## [26,] 52947.45
## [27,] 53551.01
## [28,] 53404.30
## [29,] 53029.15
## [30,] 53257.58
## [31,] 54611.43
## [32,] 54236.29
## [33,] 54464.71
## [34,] 54317.99
## [35,] 55003.26
## [36,] 55981.97
RMSE <- RMSE(y,y_predicted)
RMSE
## [1] 37658.23
R2 <- R2(y,y_predicted)
R2
##          [,1]
## s1 0.05828778
#find SST and SSE
#sst <- sum((y - mean(y))^2)
#sse <- sum((y_predicted - y)^2)
#sst
#sse
#find R-Squared
#rsq <- 1-sse/sst
#rsq

5.2)SVM Wrt State,Age grp,Year Culminated Model

library(e1071)
set.seed(123)
train_samples <- modelsvm$Total %>%
  createDataPartition(p=0.80,list=FALSE)

train <- modelsvm[train_samples,]
test <- modelsvm[-train_samples,]
#agedf

# MLR Model creation
modelsvm <- svm(Total~Year+Age_group+State,data=train, kernel = 'linear')
summary(modelsvm)
## 
## Call:
## svm(formula = Total ~ Year + Age_group + State, data = train, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.125 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  35
#Make predictions
pred <- modelsvm %>%predict(test)
pred
##           1           2           3           4           5           6 
##    552.9759  17064.7043  96668.2941  51764.1445   2986.9527  55009.4469 
##           7           8           9          10          11          12 
##  21932.6578  56632.0980  58254.7492  25177.9601    541.3441  95034.0111 
##          13          14          15          16          17          18 
##  87891.0185  18675.7237   2975.3208  97467.9878   4597.9720  54186.4895 
##          19          20          21          22          23          24 
##  54997.8150 100713.2901  22732.3516  94381.6232  24355.0027   9465.9255 
##          25          26          27          28          29          30 
##  95842.4252  88699.4326  96653.7507  90322.0838  52560.9268  53372.2524 
##          31          32          33          34          35          36 
##   4595.0605  99087.7275  99899.0531  93567.3861  96001.3628   9463.0140
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2645.219
R2 <- R2(pred,test$Total)
R2
## [1] 0.9950006

5.3)LOGISTIC REGRESSION State,Age grp,Year Culminated Model

##BIG MODEL LOGISTIC REGRESSION
# Splitting dataset
set.seed(123)
train_samplesbm <- modelLog$Year %>% createDataPartition(p=0.65,list=FALSE)
#train_samples
head(train_samplesbm)
##      Resample1
## [1,]         3
## [2,]         4
## [3,]         5
## [4,]         7
## [5,]         8
## [6,]         9
train <- modelLog[train_samplesbm,]
test <- modelLog[-train_samplesbm,]
#train
#test

as.factor(modelLog$Total)
##   [1] 5632   93274  84609  48788  18613  5189   94394  86718  50101  19502 
##  [11] 4923   95906  87929  51731  20131  5217   95084  87807  51213  19608 
##  [21] 4850   94026  86708  52429  20274  4773   101304 95655  54432  20288 
##  [31] 4732   100250 95370  56164  20443  4322   102831 98751  55497  19781 
##  [41] 4848   102474 98341  58020  21485  5571   109118 98670  58778  22748 
##  [51] 4840   108921 99956  58032  21457  4461   100139 92987  57267  22150 
##  [61] 5632   93274  84609  48788  18613  5189   94394  86718  50101  19502 
##  [71] 4923   95906  87929  51731  20131  5217   95084  87807  51213  19608 
##  [81] 4850   94026  86708  52429  20274  4773   101304 95655  54432  20288 
##  [91] 4732   100250 95370  56164  20443  4322   102831 98751  55497  19781 
## [101] 4848   102474 98341  58020  21485  5571   109118 98670  58778  22748 
## [111] 4840   108921 99956  58032  21457  4461   100139 92987  57267  22150 
## [121] 5632   93274  84609  48788  18613  5189   94394  86718  50101  19502 
## [131] 4923   95906  87929  51731  20131  5217   95084  87807  51213  19608 
## [141] 4850   94026  86708  52429  20274  4773   101304 95655  54432  20288 
## [151] 4732   100250 95370  56164  20443  4322   102831 98751  55497  19781 
## [161] 4848   102474 98341  58020  21485  5571   109118 98670  58778  22748 
## [171] 4840   108921 99956  58032  21457  4461   100139 92987  57267  22150 
## 60 Levels: 4322 4461 4732 4773 4840 4848 4850 4923 5189 5217 5571 ... 109118
# Training model
logistic_modelbm <- glm(Total~Year+Age_group+State, data = train)
logistic_modelbm 
## 
## Call:  glm(formula = Total ~ Year + Age_group + State, data = train)
## 
## Coefficients:
##      (Intercept)              Year    Age_group15-29    Age_group30-44  
##       -1520152.5             760.0           95102.8           88184.2  
##   Age_group45-59      Age_group60+  StateMaharashtra  StateWest Bengal  
##          49494.8           15737.9            -119.4            -124.0  
## 
## Degrees of Freedom: 119 Total (i.e. Null);  112 Residual
## Null Deviance:       1.713e+11 
## Residual Deviance: 849300000     AIC: 2251
pred <- logistic_modelbm %>%
  predict(test)
pred
##           1           2           3           4           5           6 
##    531.0476  95633.8662   1291.0691  97913.9306  90995.3119  52305.8882 
##           7           8           9          10          11          12 
##  53825.9311   5091.1763 100193.9949   5851.1978  55345.9740   6611.2192 
##          13          14          15          16          17          18 
##   7371.2407 102474.0593  95555.4406  96315.4621  57626.0384  97075.4835 
##          19          20          21          22          23          24 
##    526.4169  88710.6168  16264.3217   1286.4383  96389.2569  89470.6383 
##          25          26          27          28          29          30 
##  50781.2146  17784.3646  97909.2998  18544.3860  98669.3213  53061.2789 
##          31          32          33          34          35          36 
##  92510.7241  20064.4289  54581.3218  20824.4504   5846.5670  55341.3433 
##          37          38          39          40          41          42 
##  56101.3647  95550.8099  56861.3862  23864.5362  95753.2167   1410.4195 
##          43          44          45          46          47          48 
##  50905.1958  17908.3458   2930.4624  91114.6624  18668.3672  53185.2601 
##          49          50          51          52          53          54 
##  99553.3239   5970.5482  21708.4530   6730.5697 101833.3883  56225.3459 
##          55          56          57          58          59          60 
##   7490.5911 102593.4097  96434.8125  23988.5174   9010.6340  97194.8340
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2808.221
R2 <- R2(pred,test$Total)
R2
## [1] 0.9946365
#test

5.4)Wrt State,Age grp,Year Culminated Model MLR

#Testing co relation
#cor(suicide_count_overyears$Year,suicide_count_overyears$total_case)
#cor.test(suicide_count_overyears$Year,suicide_count_overyears$total_case)

#Partitioning into train and test
set.seed(123)
train_samples <- model$Total %>%
  createDataPartition(p=0.80,list=FALSE)
train <- model[train_samples,]
test <- model[-train_samples,]

#agedf

# MLR Model creation
model <- lm(Total~Year+Age_group+State,data=train)
summary(model)
## 
## Call:
## lm(formula = Total ~ Year + Age_group + State, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5030.1 -2021.1  -102.4  2051.1  6593.3 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.506e+06  1.366e+05 -11.024   <2e-16 ***
## Year              7.530e+02  6.808e+01  11.061   <2e-16 ***
## Age_group15-29    9.527e+04  7.521e+02 126.677   <2e-16 ***
## Age_group30-44    8.796e+04  7.395e+02 118.955   <2e-16 ***
## Age_group45-59    4.948e+04  7.520e+02  65.801   <2e-16 ***
## Age_group60+      1.579e+04  7.389e+02  21.369   <2e-16 ***
## StateMaharashtra -2.883e+02  5.713e+02  -0.505    0.615    
## StateWest Bengal -1.262e+02  5.813e+02  -0.217    0.828    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2809 on 136 degrees of freedom
## Multiple R-squared:  0.9949, Adjusted R-squared:  0.9946 
## F-statistic:  3760 on 7 and 136 DF,  p-value: < 2.2e-16
#Make predictions
pred <- model %>%
  predict(test)

pred
##           1           2           3           4           5           6 
##    472.9349  16261.6808  97253.4485  51459.6336   2732.0624  54471.8036 
##           7           8           9          10          11          12 
##  20779.9358  55977.8886  57483.9736  23792.1058    634.9710  95909.3996 
##          13          14          15          16          17          18 
##  88599.7117  17929.8019   2894.0985  98168.5271   4400.1835  53880.7972 
##          19          20          21          22          23          24 
##  54633.8397 101180.6971  21695.0144  94624.0517  23201.0994   8918.4385 
##          25          26          27          28          29          30 
##  96788.6794  89478.9915  97541.7219  90985.0765  52500.9495  53253.9920 
##          31          32          33          34          35          36 
##   4526.4208  99800.8494 100553.8919  93997.2465  96256.3740   9044.6758
#Verification with actual value and predicted values
RMSE <- RMSE(pred,test$Total)
RMSE
## [1] 2645.477
R2 <- R2(pred,test$Total)
R2
## [1] 0.9951535
hist(model$residuals)

qqnorm(model$residuals,ylab = "Residuals")
qqline(model$residuals)

# Prediction
new.speeds <- data.frame(
  Year = c(2013, 2013, 2013,2022,2023,2024) , Age_group = c("30-44","45-59","0-14","15-29","60+","30-44") ,State=c("Maharashtra","West Bengal","Andhra Pradesh","Andhra Pradesh","Maharashtra","West Bengal")
)
#(agedf)
predict(model, newdata = new.speeds)
##          1          2          3          4          5          6 
##  97474.186  59152.095   9797.718 111849.529  32828.616 105919.689